import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pickle
import arviz as az
import pymc3 as pm
from matplotlib.colors import to_rgb
import scipy.stats as stats
from IPython.display import display
import matplotlib as mpl
%load_ext autoreload
%autoreload 2
import plotting_lib
writeOut = True
outPathPlots = "../plots/statistical_model_neweplsar_filter_weak/"
outPathData = "../derived_data/statistical_model_neweplsar_filter_weak/"
prefix = "NewEplsar_filter_weak"
widthMM = 190
widthInch = widthMM / 25.4
ratio = 0.66666
heigthInch = ratio*widthInch
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
sns.set_style("ticks")
dpi = 300
sizes = [SMALL_SIZE,MEDIUM_SIZE,BIGGER_SIZE]
numSamples = 1000
numCores = 4
numTune = 1000
numPredSamples = 2000
random_seed=3651456135
datafile = "../derived_data/preprocessing/preprocessed_filter_weak.dat"
with open(datafile, "rb") as f:
x1,x2,_,df,dataZ,dictMeanStd,dictTreatment,dictSoftware = pickle.load(f)
Show that everything is correct:
display(pd.DataFrame.from_dict({'x1':x1,'x2':x2}))
x1 indicates the software used, x2 indicates the treatment applied.
for surfaceParam,(mean,std) in dictMeanStd.items():
print("Surface parameter {} has mean {} and standard deviation {}".format(surfaceParam,mean,std))
for k,v in sorted(dictTreatment.items(), key=lambda x: x[0]):
print("Number {} encodes treatment {}".format(k,v))
for k,v in dictSoftware.items():
print("Number {} encodes software {}".format(k,v))
display(dataZ)
display(df)
dfNewAvail = df[~df.NewEplsar.isna()].copy()
We look at the overall relationship between both epLsar variants on ConfoMap.
yrange = [0.015,0.022]
xrange = [ -0.0005 , 0.0085]
ax = sns.scatterplot(data=dfNewAvail,x='epLsar',y='NewEplsar');
ax.set_xlim(xrange);
ax.set_ylim(yrange);
Could be linear, but there is also a lot of noise.
Maybe different treatments have different behavior?
ax = sns.scatterplot(data=dfNewAvail,x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
Too crowded, let's try it per dataset
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Sheeps"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "GuineaPigs"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Lithics"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
dfNewAvail.NewEplsar
mu = dfNewAvail.NewEplsar.mean()
mu
sig = dfNewAvail.NewEplsar.std()
sig
dictMeanStd['NewEplsar'] = (mu,sig)
newZ = (dfNewAvail.NewEplsar.values-mu) / sig
newIndex = df[~df.NewEplsar.isna()].index.values
class Model_NewEplsar(pm.Model):
"""
Compute params of priors and hyperpriors.
"""
def getParams(self,x2,y):
# get lengths
Nx2Lvl = np.unique(x2).size
dims = (Nx2Lvl)
### get standard deviations
# convert to pandas dataframe to use their logic
df = pd.DataFrame.from_dict({'x2':x2,'y':y})
s2 = df.groupby('x2').std()['y'].max()
stdSingle = s2
return (dims, stdSingle)
def printParams(self,x2,y):
dims, stdSingle= self.getParams(x2,y)
Nx2Lvl = dims
s2 = stdSingle
print("The number of levels of the x variables are {}".format(dims))
print("The standard deviations used for the beta prior is {}".format(stdSingle))
def __init__(self,name,x2,y,model=None):
# call super's init first, passing model and name
super().__init__(name, model)
# get parameter of hyperpriors
dims, stdSingle = self.getParams(x2,y)
Nx2Lvl = dims
s2 = stdSingle
### hyperpriors ###
# observation hyperpriors
lamY = 1/30.
muGamma = 0.5
sigmaGamma = 2.
# prediction hyperpriors
sigma0 = pm.HalfNormal('sigma0',sd=1)
sigma2 = pm.HalfNormal('sigma2',sd=s2, shape=Nx2Lvl)
mu_b0 = pm.Normal('mu_b0', mu=0., sd=1)
mu_b2 = pm.Normal('mu_b2', mu=0., sd=1, shape=Nx2Lvl)
beta2 = 1.5*(np.sqrt(6)*sigma2)/(np.pi)
### priors ###
# observation priors
nuY = pm.Exponential('nuY',lam=lamY)
sigmaY = pm.Gamma('sigmaY',mu=muGamma, sigma=sigmaGamma)
# prediction priors
b0_dist = pm.Normal('b0_dist', mu=0, sd=1)
b0 = pm.Deterministic("b0", mu_b0 + b0_dist * sigma0)
b2_beta = pm.HalfNormal('b2_beta', sd=beta2, shape=Nx2Lvl)
b2_dist = pm.Gumbel('b2_dist', mu=0, beta=1)
b2 = pm.Deterministic("b2", mu_b2 + b2_beta * b2_dist)
#### prediction ###
mu = pm.Deterministic('mu',b0 + b2[x2])
### observation ###
y = pm.StudentT('y',nu = nuY, mu=mu, sd=sigmaY, observed=y)
with pm.Model() as model:
new_epLsarModel = Model_NewEplsar('NewEplsar',x2[newIndex],newZ)
new_epLsarModel.printParams(x2[newIndex],newZ)
try:
graph_new_epLsar = pm.model_to_graphviz(new_epLsarModel)
except:
graph_new_epLsar = "Could not make graph"
graph_new_epLsar
with new_epLsarModel as model:
prior_pred_new_epLsar = pm.sample_prior_predictive(samples=numPredSamples,random_seed=random_seed)
plotting_lib.plotPriorPredictive(widthInch,heigthInch,dpi,writeOut,outPathPlots,dfNewAvail.reset_index(),\
dictMeanStd,prior_pred_new_epLsar,newZ,'NewEplsar',prefix)
Prior choice is as intended: Broad over the data range.
with new_epLsarModel as model:
trace_new_epLsar = pm.sample(numSamples,cores=numCores,tune=numTune,max_treedepth=20, init='auto',target_accept=0.99,random_seed=random_seed)
with new_epLsarModel as model:
if writeOut:
with open(outPathData + '{}_model_{}.pkl'.format(prefix,'NewEplsar'), 'wb') as buff:
pickle.dump({'model':new_epLsarModel, 'trace': trace_new_epLsar}, buff)
with new_epLsarModel as model:
dataTrace_new_epLsar = az.from_pymc3(trace=trace_new_epLsar)
pm.summary(dataTrace_new_epLsar,hdi_prob=0.95).round(2)
az.plot_forest(dataTrace_new_epLsar,var_names=['b0','b2'],filter_vars='like',figsize=(widthInch,5*heigthInch),hdi_prob=0.95,ess=True,r_hat=True);
if writeOut:
plt.savefig(outPathPlots + "{}_posterior_forest_{}.pdf".format(prefix,'NewEplsar'),dpi=dpi)
with new_epLsarModel as model:
plotting_lib.plotTracesB(widthInch,heigthInch,dpi,writeOut,outPathPlots,trace_new_epLsar,'NewEplsar',prefix)
with new_epLsarModel as model:
plotting_lib.pm.energyplot(trace_new_epLsar)
with new_epLsarModel as model:
posterior_pred_new_epLsar = pm.sample_posterior_predictive(trace_new_epLsar,samples=numPredSamples,random_seed=random_seed)
plotting_lib.plotPriorPosteriorPredictive(widthInch,heigthInch,dpi,writeOut,\
outPathPlots,dfNewAvail.reset_index(),dictMeanStd,\
prior_pred_new_epLsar,posterior_pred_new_epLsar,newZ,'NewEplsar',prefix)
with new_epLsarModel as model:
pm_data_new_epLsar = az.from_pymc3(trace=trace_new_epLsar,prior=prior_pred_new_epLsar,posterior_predictive=posterior_pred_new_epLsar)
plotting_lib.plotPosterior(widthInch,heigthInch,dpi,writeOut,outPathPlots,dictMeanStd,\
pm_data_new_epLsar,'NewEplsar',prefix)
b1P_Old = np.load("../derived_data/statistical_model_two_factors_filter_weak/statistical_model_two_factors_filter_weak_epLsar_oldb1.npy")
b2P_Old = np.load("../derived_data/statistical_model_two_factors_filter_weak/statistical_model_two_factors_filter_weak_epLsar_oldb2.npy")
M12P_Old = np.load("../derived_data/statistical_model_two_factors_filter_weak/statistical_model_two_factors_filter_weak_epLsar_oldM12.npy")
from collections import defaultdict
def plotTreatmentPosterior(widthInch,heigthInch,dpi,sizes,writeOut,path,dictMeanStd,dictTreatment,dictSoftware,trace,yname,x1,x3,prefix):
SMALL_SIZE,MEDIUM_SIZE,BIGGER_SIZE = sizes
mu_Val,sig_Val = dictMeanStd[yname]
# get posterior samples
b2P = sig_Val*trace['{}_b2'.format(yname)]
# prepare color dict for treatments
# use groups of 4 colors, as in tab20c
colorIndex = dict({5:0,6:1,7:2,0:4,1:5,4:6,10:7,2:8,3:9,8:10,9:11})
# prepare dataset dict for treatments
dictDataset = dict({5:0,6:0,7:0,0:1,1:1,4:1,10:1,2:2,3:2,8:2,9:2})
# === inverse dict ====
inv_dictDataset = defaultdict(list)
# using loop to perform reverse mapping
for keys, vals in dictDataset.items():
for val in [vals]:
inv_dictDataset[val].append(keys)
# ===
# get number of datasets
numDatasets = len(np.unique(list(dictDataset.values())))
# get number of treatments per dataset
dictDataset2NumberTreats = dict()
for numDataset in range(numDatasets):
n = len(inv_dictDataset[numDataset])
dictDataset2NumberTreats[numDataset] = n
# Get maximum of treatments per dataset
tmax = np.max(list(dictDataset2NumberTreats.values()))
# compute maximal number of pairs
maxpair = int(tmax*(tmax-1)/2)
fig = plt.subplots(squeeze=False, figsize=(numDatasets*widthInch,maxpair*heigthInch), dpi=dpi);
# store list for hdi
hdiList = []
for indexDataset in np.arange(numDatasets):
# counter for row
rowCounter = 0
# first treatment
for treatmentNum_i,lvl2_i in enumerate(inv_dictDataset[indexDataset]):
# second treatment
for treatmentNum_j,lvl2_j in enumerate(inv_dictDataset[indexDataset]):
if treatmentNum_i > treatmentNum_j:
# set subplot
curr_ax = plt.subplot2grid((maxpair, numDatasets), (rowCounter,indexDataset))
# compute difference between treatments for each software
diffS0 = sig_Val*((M12P_Old[:,0,lvl2_i]+b2P_Old[:,lvl2_i]) -(M12P_Old[:,0,lvl2_j]+b2P_Old[:,lvl2_j]))
diffS1 = sig_Val*((M12P_Old[:,1,lvl2_i]+b2P_Old[:,lvl2_i]) -(M12P_Old[:,1,lvl2_j]+b2P_Old[:,lvl2_j]))
#plot posterior
sns.kdeplot(diffS1,ax=curr_ax,label="epLsar on {}".format(dictSoftware[1]),color='gray',alpha=0.3,ls='--');
sns.kdeplot(diffS0,ax=curr_ax,label="epLsar on {}".format(dictSoftware[0]),color='gray',alpha=0.3,ls='dotted');
sns.kdeplot(b2P[:,lvl2_i]-b2P[:,lvl2_j],ax=curr_ax,label="NewEplsar on {}".format(dictSoftware[0]),color='C0',alpha=0.3,ls='dotted');
# plot reference value zero
curr_ax.axvline(x=0,color="C1")
# get hdi
hdi_new = az.hdi(az.convert_to_inference_data(b2P[:,lvl2_i]-b2P[:,lvl2_j]),hdi_prob=0.95)['x'].values
hdiS0 = az.hdi(az.convert_to_inference_data(diffS0),hdi_prob=0.95)['x'].values
hdiS1 = az.hdi(az.convert_to_inference_data(diffS1),hdi_prob=0.95)['x'].values
isSignificant = lambda x: (x[0] > 0.0) or (x[1] < 0.0)
# store hdi
hdiList.append([dictTreatment[lvl2_i],dictTreatment[lvl2_j],
hdi_new[0],hdi_new[1],isSignificant(hdi_new),
hdiS0[0],hdiS0[1],isSignificant(hdiS0),
hdiS1[0],hdiS1[1],isSignificant(hdiS1)
])
# set title
nameFirst = dictTreatment[lvl2_i]
nameSecond = dictTreatment[lvl2_j]
title = "{} vs. {}".format(nameFirst,nameSecond)
if isSignificant(hdi_new):
title += ": Significant on NewEplsar"
curr_ax.set_title(title)
# add legend
curr_ax.legend()
# set x label
curr_ax.set_xlabel('Delta')
# remove y label decoration
curr_ax.tick_params(left=False)
curr_ax.set(yticklabels=[])
# increment counter
rowCounter += 1
#plt.suptitle('Estimated differences between treatments on {}'.format(yname))
plt.tight_layout()
if writeOut:
plt.savefig(path + "{}_treatment_pairs_{}.pdf".format(prefix,yname),dpi=dpi)
plt.show()
# convert hdi to df
df = pd.DataFrame(hdiList,columns=["Treatment_i","Treatment_j",
"hdi_NewEplsar_2.5%","hdi_NewEplsar_97.5%","isSignificant_NewEplsar","hdi_{}_2.5%".format(dictSoftware[0]),"hdi_{}_97.5%".format(dictSoftware[0]),"isSignificant_on_{}".format(dictSoftware[0]),
"hdi_{}_2.5%".format(dictSoftware[1]),"hdi_{}_97.5%".format(dictSoftware[1]),"isSignificant_on_{}".format(dictSoftware[1])])
return df
dfHDI = plotTreatmentPosterior(widthInch,heigthInch,dpi,sizes,writeOut,outPathPlots,dictMeanStd,dictTreatment,dictSoftware,trace_new_epLsar,'NewEplsar',x1[newIndex],x2[newIndex],prefix)
dfHDI
if writeOut:
dfHDI.to_csv(outPathData+ '{}_hdi_{}_filter_weak.csv'.format(prefix,'NewEplsar'))
Show where NewEplsar yields other results then epLsar:
df_summary = dfHDI[(dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_ConfoMap) | (dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_Toothfrax) ][["Treatment_i","Treatment_j","isSignificant_NewEplsar","isSignificant_on_Toothfrax","isSignificant_on_ConfoMap","hdi_NewEplsar_2.5%","hdi_NewEplsar_97.5%","hdi_ConfoMap_2.5%","hdi_ConfoMap_97.5%","hdi_Toothfrax_2.5%","hdi_Toothfrax_97.5%"]]
df_summary
if writeOut:
df_summary.to_csv(outPathData+ 'neweplsar_summary_filter_weak.csv')
!jupyter nbconvert --to html Statistical_Model_NewEplsar_filter_weak.ipynb
!jupyter nbconvert --to markdown Statistical_Model_NewEplsar_filter_weak.ipynb